import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)  # show every column when displaying frames
pd.set_option('display.max_rows', None)     # show every row when displaying frames

# FIX: '!pip install folium' / '!pip install beautifulsoup4' are Jupyter shell
# magics, not valid Python syntax, and break this file when run as a script.
# Install the packages beforehand instead:
#   pip install folium beautifulsoup4

import requests
import yaml
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from bs4 import BeautifulSoup
from sklearn.neighbors import DistanceMetric
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
from folium import plugins
from folium import plugins  # NOTE(review): duplicate import; harmless but redundant
from folium.plugins import HeatMap

mpl.style.use('ggplot')  # optional: for ggplot-like style
# Victorian crime data (CSV hosted on GitHub).
url = 'https://raw.githubusercontent.com/AshVJ/Capstone_Final_Project/master/VICCrimeData.csv'
# FIX: 'error_bad_lines' was deprecated in pandas 1.3 and removed in 2.0;
# 'on_bad_lines="skip"' is the direct replacement.
df = pd.read_csv(url, on_bad_lines='skip')
df.shape
df.head()
# Keep only the columns needed for this analysis.
df.drop(["Year", "Area", "Postcode", "Offence Subdivision", "Offence Subgroup"], axis=1, inplace=True)
df.shape
df.head()
df.rename(columns={"Offence Division": "Offence", "Incidents Recorded": "Incidents"}, inplace=True)
df.dtypes
# Convert the contents of Incidents into numbers, but first remove the thousands separator ','
df.Incidents = df.Incidents.str.replace(",", "")
# FIX(comment): the original comment claimed NaNs were replaced with 0,
# but the code drops those rows entirely.
df = df.dropna(subset=['Incidents'])
df.dtypes
df["Incidents"] = df["Incidents"].astype(int)
df.dtypes
# Group to collapse duplicate (Suburb, Offence) rows into one summed count.
df = df.groupby(['Suburb','Offence'])['Incidents'].sum().reset_index()
df.shape
# Suburb -> latitude/longitude lookup table.
url2 = 'https://raw.githubusercontent.com/AshVJ/Capstone_Final_Project/master/VicLatLongCSV.csv'
# FIX: 'error_bad_lines' was removed in pandas 2.0; use 'on_bad_lines'.
df_ll = pd.read_csv(url2, on_bad_lines='skip')
df_ll.shape
# First find the column names
df.head()
df_ll.head()
# The column case sensitivity doesn't match across the data frames, so
# upper-case the join keys on both sides.
df_ll['Locality'] = df_ll['Locality'].str.upper()
df['Suburb'] = df['Suburb'].str.upper()
# Merge both data sets to get lat and long on our crimes data.
df_viccrimedata = df.merge(df_ll, left_on='Suburb', right_on='Locality', how='left')
#df_ll.head()
df_viccrimedata.head(10)
# Check if there are any columns with null values before proceeding.
df_viccrimedata.isnull().sum()
# Remove the now-redundant 'Locality' join key.
df_viccrimedata.drop(["Locality"], axis=1, inplace=True)
df_viccrimedata.head()
# Liveability ranking of 307 Melbourne suburbs, sourced from:
# https://www.domain.com.au/liveable-melbourne/melbournes-most-liveable-suburbs-2019/melbournes-307-suburbs-ranked-for-liveability-2019-898676/
url3 = 'https://raw.githubusercontent.com/AshVJ/Capstone_Final_Project/master/Top307VicSuburbsDomain.csv'
# FIX: 'error_bad_lines' was removed in pandas 2.0; use 'on_bad_lines'.
df_top307 = pd.read_csv(url3, on_bad_lines='skip')
df_top307.shape
df_top307.head()
df_top307['Suburb'] = df_top307['Suburb'].str.upper()  # normalise the join key
df_top307.head()
# Restrict the crime data to the 307 ranked suburbs.
df_307CrimeData = df_top307.merge(df, left_on='Suburb', right_on='Suburb', how='left')
df_307CrimeData.drop(["Ranking"], axis=1, inplace=True)
df_307CrimeData.head()
# Check for null values before proceeding.
df_307CrimeData.loc[df_307CrimeData['Incidents'].isnull()]
df_307CrimeData.isnull().sum()
# Drop suburbs with no matching crime records.
df_307CrimeData = df_307CrimeData.dropna(subset=['Incidents'])
df_307CrimeData.shape
# Attach coordinates, then drop the redundant join key.
df_307CrimeData = df_307CrimeData.merge(df_ll, left_on='Suburb', right_on='Locality', how='left')
df_307CrimeData.drop(['Locality'], axis=1, inplace=True)
df_307CrimeData.head()
# Unique coordinate pairs, sorted for inspection.
df_unq_sub = df_307CrimeData[['Latitude','Longitude']].copy()
df_unq_sub.shape
df_unq_sub.sort_values('Latitude', inplace = True)
df_unq_sub.head()
df_unq_sub.drop_duplicates(inplace = True)
df_unq_sub.shape
df_unq_sub.head()
# SECURITY NOTE(review): these Foursquare credentials are hard-coded and
# committed to a public repository. They should be revoked and loaded from an
# environment variable or a config file instead of living in source.
CLIENT_ID = '0UOOVVQUXZTSOJ5302UDTB1L500GIY0RCTXSTUJYXUIFGJXG' # your Foursquare ID
CLIENT_SECRET = 'IBQQ0QYDBQCBCDZ3QBRSVC1JWKU0HGMIIMDVM4DOJW34RGAB' # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version date
LIMIT = 30  # default result limit (reassigned to 100 further down)
FourSquare does not actually provide an API that returns a list of the top venues to visit in a city. To get this list we can, however, use the FourSquare website directly to request the top sites in Melbourne and then use BeautifulSoup to scrape the data we need. Once we have this starting data, the other supplemental data needed to complete this dataset can be retrieved using the FourSquare Venue API.
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # search radius in metres
# Build the 'explore' request URL around Melbourne's CBD coordinates.
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,
CLIENT_SECRET,
VERSION,
'-37.8142176',
'144.9631608',
radius,
LIMIT)
url # displays the URL
# Send the GET request and examine the results
results = requests.get(url).json()
results
# define a function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category for a venue record.

    ``row`` is a mapping (e.g. a flattened Foursquare JSON record or a pandas
    Series) holding the venue's category list under either 'categories' or
    'venue.categories'. Returns None when the category list is empty.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # FIX: narrowed the original bare 'except:' -- only a missing key
        # should fall through to the alternative column name.
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']
# FIX: 'from pandas.io.json import json_normalize' was deprecated in pandas 1.0
# and removed in 2.0 -- use pd.json_normalize instead.
# clean the json and structure it into a pandas dataframe.
venues = results['response']['groups'][0]['items']
nearby_venues = pd.json_normalize(venues) # flatten JSON
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]
# collapse the category list into a single category name per row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
# clean columns: keep only the last dotted component, e.g. 'venue.name' -> 'name'
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head(20)
nearby_venues.shape
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare 'explore' endpoint for each suburb and collect venues.

    Parameters
    ----------
    names, latitudes, longitudes : equal-length iterables describing each suburb.
    radius : search radius in metres around each coordinate.

    Returns a DataFrame with one row per (suburb, venue) pair.
    Relies on the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        # Keep only the relevant fields. FIX: skip venues whose category list
        # is empty -- the original indexed [0] unconditionally and raised
        # IndexError on such venues.
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name'])
            for v in results if v['venue']['categories']])
    # flatten the per-suburb lists into one frame
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Suburb',
                             'Latitude',
                             'Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
# Unique (Suburb, Latitude, Longitude) triples for all ranked suburbs.
df_unq_sub_all = df_307CrimeData[['Suburb','Latitude','Longitude']].copy()
df_unq_sub_all.shape
df_unq_sub_all.drop_duplicates(inplace = True)
df_unq_sub_all.shape
# Small 20-suburb sample used to smoke-test the Foursquare calls first.
df_unq_sub_20 = df_unq_sub_all.tail(20).copy()
df_unq_sub_20.shape
vic_venues = getNearbyVenues(names=df_unq_sub_20['Suburb'],
latitudes=df_unq_sub_20['Latitude'],
longitudes=df_unq_sub_20['Longitude']
)
vic_venues.shape
vic_venues.head(100)
# Full run across every unique suburb (one API call per suburb).
vic_venues_all = getNearbyVenues(names=df_unq_sub_all['Suburb'],
latitudes=df_unq_sub_all['Latitude'],
longitudes=df_unq_sub_all['Longitude']
)
vic_venues_all.shape
vic_venues_all.head()
# Venue names with coordinates, for later plotting.
# NOTE(review): 'Latitude'/'Longitude' here are the SUBURB's coordinates, not
# 'Venue Latitude'/'Venue Longitude' -- confirm which was intended before
# treating these markers as venue locations.
df_vic_all = vic_venues_all[['Venue','Latitude','Longitude']].copy()
df_vic_all.head()
Now we have both Crime Data and FourSquare Data - let us start visualising the data we have
df.describe()
# Create a new crimes data frame consolidating the number of incidents
# irrespective of the type of crime; we still need the full data set for
# future analysis.
df_viccrime_cons = df_viccrimedata[['Suburb','Incidents','Latitude', 'Longitude']].copy()
df_viccrime_cons.head()
# we do another group by suburbs
df_viccrime_cons = df_viccrime_cons.groupby(['Suburb','Latitude', 'Longitude'])['Incidents'].sum().reset_index()
df_viccrime_cons.head()
# Two views of the 307-suburb data: worst (most incidents) and best (fewest).
df_viccrime_bad = df_307CrimeData.drop(["Latitude", "Longitude"], axis=1)
df_viccrime_gud = df_307CrimeData.drop(["Latitude", "Longitude"], axis=1)
# now group by suburb and sum the incidents
df_viccrime_bad = df_viccrime_bad.groupby(['Suburb'])['Incidents'].sum().reset_index()
df_viccrime_gud = df_viccrime_gud.groupby(['Suburb'])['Incidents'].sum().reset_index()
# now sort this data
df_viccrime_bad = df_viccrime_bad.sort_values(['Incidents'], ascending = False, axis = 0)
df_viccrime_gud = df_viccrime_gud.sort_values(['Incidents'], ascending = True, axis = 0)
#### Top 10 Suburbs with Highest Crime Rating
df_viccrime_bad.head(10)
#### Top 10 Suburbs with Lowest Crime Rating
df_viccrime_gud.head(10)
# Bar chart: 30 suburbs with the most recorded incidents.
df_4plot1 = df_viccrime_bad[['Suburb','Incidents']].head(30)
df_4plot1.set_index('Suburb',inplace = True)
ax = df_4plot1.plot(kind='bar', figsize=(20, 6), rot=0)
ax.set_ylabel('Number of Crimes') # add y-label to the plot
ax.set_xlabel('Suburb') # add x-label to the plot
ax.set_title('Vic suburbs with the Highest no. of crime') # add title to the plot
# Annotate each bar with its height (incident count).
for p in ax.patches:
    ax.annotate(np.round(p.get_height(),decimals=2),
    (p.get_x()+p.get_width()/2., p.get_height()),
    ha='center',
    va='center',
    xytext=(0, 10),
    textcoords='offset points',
    fontsize = 10
    )
plt.xticks(rotation=45)
plt.show()
# Bar chart: 30 suburbs with the FEWEST recorded incidents.
df_4plot1 = df_viccrime_gud[['Suburb','Incidents']].head(30)
df_4plot1.set_index('Suburb',inplace = True)
ax = df_4plot1.plot(kind='bar', figsize=(30, 6), rot=0)
ax.set_ylabel('Number of Crimes')  # y-label
ax.set_xlabel('Suburb')  # x-label
# BUG FIX: this chart plots df_viccrime_gud (sorted ascending), i.e. the
# suburbs with the LOWEST number of crimes; the title previously said
# 'Highest' (copy-paste from the chart above).
ax.set_title('Vic suburbs with the Lowest no. of crime')
# Annotate each bar with its height (incident count).
for p in ax.patches:
    ax.annotate(np.round(p.get_height(), decimals=2),
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center',
                va='center',
                xytext=(0, 10),
                textcoords='offset points',
                fontsize=10)
plt.xticks(rotation=45)
plt.show()
The visualised data helps us understand which suburbs one might like to live in. Knowing the suburbs with the highest and the lowest crime counts, a new family can choose somewhere appropriate to live.
df.Offence.nunique()
# Which crimes are the 10 most commonly occurring ones.
# NOTE(review): count() tallies the number of (Suburb, Offence) rows per
# offence, not the summed incident counts -- confirm that is the intended
# notion of "most common" (sum() would weight by incidents instead).
df_top10 = df[['Offence', 'Incidents']].groupby(
['Offence'], as_index=False).count().sort_values(
'Incidents', ascending=False).head(10)
# Which crimes are the 3 most commonly occurring ones
df[['Offence', 'Incidents']].groupby(
['Offence'], as_index=False).count().sort_values(
'Incidents', ascending=False).head(3)
df_top10
# Bar chart of the top-10 offence types.
df_4plot1 = df_top10[['Offence','Incidents']]
df_4plot1.set_index('Offence',inplace = True)
ax = df_4plot1.plot(kind='bar', figsize=(15, 6), rot=0)
ax.set_ylabel('Number of Incidents') # add y-label to the plot
ax.set_xlabel('Type of Crime') # add x-label to the plot
ax.set_title('Top 10 Crimes in Victoria', loc='left', fontsize=18) # add title to the plot
# Annotate each bar with its height.
for p in ax.patches:
    ax.annotate(np.round(p.get_height(),decimals=2),
    (p.get_x()+p.get_width()/2., p.get_height()),
    ha='center',
    va='center',
    xytext=(0, 10),
    textcoords='offset points',
    fontsize = 10
    )
plt.ylim(bottom=0)
plt.xticks(rotation=90)
plt.show()
# create map and display it
VIC_map = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=13)
# display the map of Melbourne, Vic
VIC_map
# Create a list of the 10 most commonly occurring crimes
top_ten_crimes = df[['Offence', 'Incidents']].groupby(
    ['Offence']).count().sort_values('Incidents', ascending=False)[:10].axes[0].tolist()
# Create a list of the 3 most commonly occurring crimes
top_three_crimes = df[['Offence', 'Incidents']].groupby(
    ['Offence']).count().sort_values('Incidents', ascending=False)[:3].axes[0].tolist()
# Ten marker colours, one per top-10 crime.
colors10 = [
    'red',
    'blue',
    'gray',
    'orange',
    'beige',
    'green',
    'purple',
    'pink',
    'cadetblue',
    'black'
]
# Dictionary of colours keyed by crime
dict_colours10 = dict(zip(top_ten_crimes, colors10))
# Three colours for the top-3 crimes
colors3 = [
    'red',
    'blue',
    'green'
]
dict_colours3 = dict(zip(top_three_crimes, colors3))
# Data frames restricted to the top-10 / top-3 crimes
df_top10_crimes = df[df['Offence'].isin(top_ten_crimes)].copy()
df_top3_crimes = df[df['Offence'].isin(top_three_crimes)].copy()
# Copy of the data for the 307 suburbs, so we do not map all of Victoria
df_307Color = df_307CrimeData.copy()
# BUG FIX: colour each row by its OWN Offence column. The original mapped
# df_top10_crimes.Offence (a different frame with a different index), which
# pandas aligns by index, producing misaligned / mostly-NaN colours.
df_307Color['colour'] = df_307Color.Offence.map(dict_colours10)
# BUG FIX: take a real copy -- the original aliased df_viccrimedata despite
# the "create a copy" comment, so adding 'colour' mutated the source frame.
df_Color = df_viccrimedata.copy()
df_Color['colour'] = df_Color.Offence.map(dict_colours10)
df_307Color.head()
df_Color.head()
# create map and display it
VIC_map = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=8)
#-37.8142176, 144.9631608
# display the map of Melbourne VIc
VIC_map
# Instantiate a feature group for the incidents in the dataframe
incidents = folium.map.FeatureGroup()
# loop through the 307-suburb crimes and add each to the incidents feature group
for lat, lng, col in zip(df_307Color.Latitude,
    df_307Color.Longitude,
    df_307Color.colour):
    incidents.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=1, # define how big you want the circle markers to be
            color=col,
            fill=True,
            fill_color=col,
            fill_opacity=0.6
        )
    )
# add incidents to map
VIC_map.add_child(incidents)
mc = MarkerCluster()
# Define the map centered around Melbourne with a higher zoom level
VIC_clust = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=11)
# display the map
VIC_clust
# Create a Marker for each point; each gets a popup with its offence type
for row in df_307Color.itertuples():
    mc.add_child(folium.Marker(
        location=[row.Latitude, row.Longitude],
        popup=row.Offence))
VIC_clust.add_child(mc)
VIC_clust
# Heat map of the same incident locations
VIC_heat = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=11)
# List comprehension to make our list of [lat, lng] pairs
heat_data = [[row['Latitude'],
    row['Longitude']] for index, row in df_307Color.iterrows()]
# Plot it on the map
# NOTE(review): the 'max_val' keyword was removed from folium's HeatMap in
# newer releases -- confirm the installed folium version still accepts it.
HeatMap(heat_data,
    min_opacity=0.5,
    max_zoom=18,
    max_val=1.0,
    radius=15,
    blur=20,
    gradient=None,
    overlay=True).add_to(VIC_heat)
# Display the map
VIC_heat
# Same three visualisations as above, but for ALL Victorian suburbs (df_Color)
# rather than just the 307 ranked ones.
VIC_map = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=10)
#-37.8142176, 144.9631608
# display the map of Melbourne VIc
VIC_map
# Instantiate a feature group for the incidents in the dataframe
incidents = folium.map.FeatureGroup()
# loop through the crimes and add each to the incidents feature group
for lat, lng, col in zip(df_Color.Latitude,
    df_Color.Longitude,
    df_Color.colour):
    incidents.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=1, # define how big you want the circle markers to be
            color=col,
            fill=True,
            fill_color=col,
            fill_opacity=0.6
        )
    )
# add incidents to map
VIC_map.add_child(incidents)
mc = MarkerCluster()
# Define the map centered around Melbourne with a higher zoom level
VIC_clust = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=10)
# display the map
VIC_clust
# Create a Marker for each point; each gets a popup with its offence type
for row in df_Color.itertuples():
    mc.add_child(folium.Marker(
        location=[row.Latitude, row.Longitude],
        popup=row.Offence))
VIC_clust.add_child(mc)
VIC_clust
# Heat map of all incident locations
VIC_heat = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=9)
# List comprehension to make our list of [lat, lng] pairs
heat_data = [[row['Latitude'],
    row['Longitude']] for index, row in df_Color.iterrows()]
# Plot it on the map
# NOTE(review): 'max_val' was removed from folium's HeatMap in newer
# releases -- confirm the installed folium version still accepts it.
HeatMap(heat_data,
    min_opacity=0.5,
    max_zoom=18,
    max_val=1.0,
    radius=15,
    blur=20,
    gradient=None,
    overlay=True).add_to(VIC_heat)
# Display the map
VIC_heat
# Collect crime data for the 307 suburbs
df_crime_all = df_307CrimeData[['Latitude', 'Longitude', 'Incidents','Suburb']].copy()
df_crime_all.head()
# Group the incident count per location
df_crime_all = df_crime_all.groupby(['Latitude','Longitude', 'Suburb'])['Incidents'].sum().reset_index()
df_crime_all.shape
df_crime_all.head()
# Popup label in the form "<suburb>-<incident count>"
df_crime_all['InsSub'] = df_crime_all['Suburb'] + '-' + df_crime_all['Incidents'].astype(str)
df_crime_all.head()
# Lower-case coordinate column names to match the venue frame below
df_crime_all.rename(columns={"Latitude": "latitude", "Longitude": "longitude"}, inplace=True)
df_crime_all.head()
df_vic_all.rename(columns={"Venue": "name", "Latitude": "latitude","Longitude": "longitude"}, inplace=True)
df_vic_all.head()
# Final combined map: crime heat map + suburb incident markers + venue markers.
vic_heatmat = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=8)
# List comprehension to make our list of crime [latitude, longitude] pairs
heat_data = [[row['latitude'],
    row['longitude']] for index, row in df_crime_all.iterrows()]
# Plot the crimes on the map
# NOTE(review): 'max_val' was removed from folium's HeatMap in newer
# releases -- confirm the installed folium version still accepts it.
HeatMap(heat_data,
    min_opacity=0.5,
    max_zoom=18,
    max_val=1.0,
    radius=20,
    blur=30,
    gradient=None,
    overlay=True).add_to(vic_heatmat)
# Add a marker for Melbourne's CBD to the map
folium.Marker(
    location=[-37.8142176, 144.9631608],
    popup='Melbourne',
    icon=folium.Icon(color='red', icon='thumbs-up')
).add_to(vic_heatmat)
# Add the incidents to the map (red markers; popup shows "<suburb>-<count>")
for row in df_crime_all.itertuples():
    popup_text = '<h4>' + str(row.InsSub) + '</h4>'
    popup = folium.Popup(popup_text)
    folium.Marker([row.latitude, row.longitude],
        popup=popup,
        icon=folium.Icon(color='red', icon='info-sign')
        ).add_to(vic_heatmat)
# Add the venues to the map (blue markers)
# NOTE(review): df_vic_all carries the suburb's coordinates rather than the
# venue's own ('Venue Latitude'/'Venue Longitude') -- confirm which was intended.
for row in df_vic_all.itertuples():
    popup_text = '<h3>' + str(row.name) + '</h3>'
    popup = folium.Popup(popup_text)
    folium.Marker([row.latitude, row.longitude],
        popup=popup,
        icon=folium.Icon(color='blue', icon='info-sign')
        ).add_to(vic_heatmat)
# Display the map
vic_heatmat
Inferential statistics allows us to provide insight on a given topic. There are many types of statistical tests that allow one to make inferences. Some of the common statistical tests are:
Correlations, Chi-square test, Independent t-test (a.k.a. Student's t-test), Paired sample t-test, Welch's t-test, Wilcoxon signed-rank test, Linear regression, Logistic regression, One-way Analysis of Variance (ANOVA), and Two-way/N-way ANOVA. In this section we will investigate whether there are any obvious inferential statistical methods that can help us when modelling the data.
#df_restyy1
df_307CrimeData.head()
# Cross-tabulation of offence type vs suburb (row counts per combination).
df_crimeward_crosstab = pd.crosstab(df_307CrimeData.Offence, df_307CrimeData.Suburb)
df_crimeward_crosstab.head()
# NOTE(review): 'df_top_venues' is never defined anywhere in this file, so the
# following statements will raise NameError as written. It presumably came
# from a notebook cell that was deleted -- restore that cell or drop these plots.
df_top_venues.head()
df_top_venues.plot.scatter('score', 'latitude', figsize=(10,6))
plt.xlabel('Score')
plt.ylabel('Latitude')
plt.title('Scatter Plot of Top Venues Latitude and Score', loc='left', fontsize=18)
df_top_venues.plot.scatter('score', 'longitude', figsize=(10,6))
plt.xlabel('Score')
plt.ylabel('Longitude')
plt.title('Scatter Plot of Top Venues Longitude and Score', loc='left', fontsize=18)
In this section multiple models will be created and evaluated before a final model is chosen and evaluated
# All the SciKit Learn Libraries Required
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score
# Decision Tree
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
# Small function to do K-fold cross validation.
def cross_validate(model, n_splits = 10, X=None, y=None):
    """Fit `model` on each of `n_splits` KFold splits and score it.

    Returns the 40th/50th/60th percentiles of the per-fold scores.
    For backward compatibility, when X / y are not supplied the module-level
    globals `X` and `y` are used, exactly as the original version did.
    NOTE(review): no `X` / `y` are defined anywhere in this file, so the
    global fallback will raise NameError unless they are created first --
    prefer passing X and y explicitly.
    """
    if X is None:
        X = globals()['X']
    if y is None:
        y = globals()['y']
    k_fold = KFold(n_splits = n_splits)
    scores = [model.fit(X[train], y[train]).score(X[test], y[test])
              for train, test in k_fold.split(X)]
    return np.percentile(scores, [40, 50, 60])
Before we start modelling we need to prepare the data frame to include only numerical data and to remove unneeded columns.
vic_venues_all.head()
vic_venues_all.groupby('Suburb').count()
# Calculate how many unique categories there are.
print('There are {} unique venue categories.'.format(len(vic_venues_all['Venue Category'].unique())))
# Analyze each suburb: one-hot encode the venue categories.
vic_onehot = pd.get_dummies(vic_venues_all[['Venue Category']], prefix="", prefix_sep="")
# add the suburb column back to the dataframe
vic_onehot['Suburb'] = vic_venues_all['Suburb']
# move the suburb column to the first position
fixed_columns = [vic_onehot.columns[-1]] + list(vic_onehot.columns[:-1])
vic_onehot = vic_onehot[fixed_columns]
vic_onehot
vic_onehot.shape
# Mean of the one-hot columns per suburb = frequency of each category there.
vic_grouped = vic_onehot.groupby('Suburb').mean().reset_index()
vic_grouped
vic_grouped.shape
# Print each suburb with its top 10 most common venue categories.
num_top_venues = 10
for hood in vic_grouped['Suburb']:
    print("----"+hood+"----")
    # transpose the single suburb row into (venue, freq) pairs
    temp = vic_grouped[vic_grouped['Suburb'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]  # drop the 'Suburb' label row itself
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
# Helper: rank one suburb's venue-category frequencies, highest first.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries in `row`.

    The first element of `row` (the suburb name) is skipped; the remaining
    entries are category frequencies whose labels are returned in
    descending-frequency order.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
This is a very useful results table that provides information for all of the suburbs within Victoria at a glance.
# create the new dataframe and display the top 10 venues for each suburb
num_top_venues = 10
indicators = ['st', 'nd', 'rd']  # ordinal suffixes for 1st/2nd/3rd
# create columns according to number of top venues
columns = ['Suburb']
for ind in np.arange(num_top_venues):
    try:
        # 1st/2nd/3rd get their special suffix...
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        # ...and indices past the list fall back to 'th' (4th..10th)
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
vic_venues_sorted = pd.DataFrame(columns=columns)
vic_venues_sorted['Suburb'] = vic_grouped['Suburb']
# fill each row with that suburb's venue categories ranked by frequency
for ind in np.arange(vic_grouped.shape[0]):
    vic_venues_sorted.iloc[ind, 1:] = return_most_common_venues(vic_grouped.iloc[ind, :], num_top_venues)
vic_venues_sorted.head(20)
# The shape of the sorted data is
vic_venues_sorted.shape
# import k-means from clustering stage
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 5
# FIX: the positional 'axis' argument to drop() was removed in pandas 2.0;
# drop('Suburb', 1) is now a TypeError -- use the explicit keyword instead.
kut_grouped_clustering = vic_grouped.drop(columns='Suburb')
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kut_grouped_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
# add clustering labels
# NOTE(review): insert() raises if 'Cluster Labels' already exists, so this
# statement can only run once per session.
vic_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
viv_merged = vic_venues_all
# merge the ranked-venue table onto the venue data to get one labelled row per venue
viv_merged = viv_merged.join(vic_venues_sorted.set_index('Suburb'), on='Suburb')
viv_merged.head() # check the last columns!
viv_merged.info()
# Drop rows with NaN values (suburbs missing from the sorted table)
viv_merged.dropna(inplace = True)
viv_merged.shape
viv_merged['Cluster Labels'] = viv_merged['Cluster Labels'].astype(int)
viv_merged.info()
# create map (FIX: pass the coordinates as floats, not strings)
map_clusters = folium.Map(location=[-37.8142176, 144.9631608], zoom_start=11.5)
# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(viv_merged['Latitude'], viv_merged['Longitude'], viv_merged['Suburb'], viv_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=8,
        popup=label,
        # BUG FIX: labels run 0..kclusters-1, so index with 'cluster' directly.
        # The original 'rainbow[cluster-1]' sent cluster 0 to rainbow[-1],
        # giving it the same colour as the last cluster.
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.5).add_to(map_clusters)
map_clusters
Each cluster is color coded for the ease of presentation, we can see that majority of the suburbs falls in the red cluster which is the first cluster. Let us now Analyse each of the clusters to identify the characteristics of each cluster and the suburb in them.
viv_merged[viv_merged['Cluster Labels'] == 0]
The first cluster has the maximum number of records and contains venue categories related to restaurants. This is the biggest of the clusters.
viv_merged[viv_merged['Cluster Labels'] == 1]
The second cluster is considerably smaller and primarily focusses on outdoor activity and music
viv_merged[viv_merged['Cluster Labels'] == 2]
The third cluster focusses on sports and gaming; it is a relatively small cluster with only 4 records.
viv_merged[viv_merged['Cluster Labels'] == 3]
The fourth cluster focusses on furniture stores, golf coures, women's store, sports store and bar.
viv_merged[viv_merged['Cluster Labels'] == 4]
The fifth cluster mainly contains areas like parks and activities.
Our target audience is young couples, migrants and new families. From the above, let us make an assumption on what our target audience will be looking for when settling down
Lets start visualising this data
# Keep each suburb's top-3 venue columns only.
vic_ven_sort3 = vic_venues_sorted[['Suburb', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']].copy()
vic_ven_sort3
# Venue categories our target audience (young couples, new families, migrants)
# cares about, searched across each suburb's top-3 venue columns.
# REFACTOR: the original built 24 near-identical frames (df_yc1..df_yc24) by
# copy-paste; the nested comprehension below produces the same selections in
# the same concatenation order (column-major, venue inner loop).
_target_venues = ['Pub', 'Pizza Place', 'Café', 'Grocery Store',
                  'Pharmacy', 'Train Station', 'Playground', 'Bakery']
_rank_columns = ['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']
df_yc = pd.concat([vic_ven_sort3.loc[vic_ven_sort3[col] == venue]
                   for col in _rank_columns
                   for venue in _target_venues])
# NOTE(review): keep=False removes EVERY row that appears more than once, so
# suburbs matching several criteria vanish entirely; keep='first' would keep
# one copy of each -- confirm which behaviour is intended.
df_yc.drop_duplicates(keep=False, inplace=True)
df_yc.shape
df_yc.head(100)
# Now combine the shortlisted suburbs with the incidents data
df_good_subs = df_yc.merge(df_viccrime_gud, left_on='Suburb', right_on='Suburb', how='left')
df_good_subs = df_good_subs.sort_values(['Suburb'], ascending = True, axis = 0)
# Bar chart of incident counts for the shortlisted suburbs.
df_4plot1 = df_good_subs[['Suburb','Incidents']]
df_4plot1.set_index('Suburb',inplace = True)
ax = df_4plot1.plot(kind='bar', figsize=(40, 6), rot=0)
ax.set_ylabel('Number of Crimes') # add y-label to the plot
ax.set_xlabel('Suburb') # add x-label to the plot
ax.set_title('Vic suburbs breakdown') # add title to the plot
# Annotate each bar with its height (incident count).
for p in ax.patches:
    ax.annotate(np.round(p.get_height(),decimals=2),
    (p.get_x()+p.get_width()/2., p.get_height()),
    ha='center',
    va='center',
    xytext=(0, 10),
    textcoords='offset points',
    fontsize = 10
    )
plt.xticks(rotation=90)
plt.show()
# Scatter view of the same suburb/incident data.
df_tot = df_good_subs[['Suburb','Incidents']]
df_tot.head()
#df_tot.plot(kind='scatter', x=df_tot.index, y='Incidents', figsize=(10, 6), color='darkblue')
plt.figure(figsize=(20,8), dpi=80)
plt.scatter(df_tot.Suburb, df_tot.Incidents)
plt.title('Victoria - Suburbs and Crime Breakdown')
plt.xlabel('Suburbs')
plt.ylabel('Number of Incidents')
plt.xticks(rotation=90)
plt.show()
The aim of this project is to assist young couples, new families, and migrants to come to Victoria and stay in a safe and convenient location. For example, if a young couple is looking for party life, they will focus on cluster 1; we have also identified the possible suburbs they would be interested in using their first, second and third choices. If a family is planning to move to an area which has playgrounds and health and fitness venues, they would probably consider clusters 2, 4 and 5. Someone who is excited about motorbikes would choose a suburb in cluster 3. These are some of the findings for our specific target audience. We have also only considered suburbs which are within Melbourne's reach and have been listed in the top 300 suburbs to live in Victoria. Anyone should be able to view these clusters and draw the conclusions they need.
In the current state of affairs, the most important thing for a person is to make effective decisions considering all the factors which will impact the decision. Doing this without the help of technology is not feasible in this day and age. This project enables a person to make an informed decision based on two key factors: 1. SAFETY and 2. INTEREST. Future work on this project will include other factors such as rent, public transport ratings, school ratings and, most of all, the age groups of people living in each suburb.